import polars as pl
import sqlite3
import plotly.express as px
import missingno as msno

Import the data using Polars (trying something new…)

# Load the Lightcast job postings export and report its dimensions.
csv_path = "lightcast_job_postings.csv"
jobs = pl.read_csv(csv_path)
print(jobs.shape)
(72476, 131)
# List every raw column name (131 columns) before any pruning.
print(jobs.columns)
['ID', 'LAST_UPDATED_DATE', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL', 'ACTIVE_URLS', 'ACTIVE_SOURCES_INFO', 'TITLE_RAW', 'BODY', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_RAW', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION', 'CITY', 'CITY_NAME', 'COUNTY', 'COUNTY_NAME', 'MSA', 'MSA_NAME', 'STATE', 'STATE_NAME', 'COUNTY_OUTGOING', 'COUNTY_NAME_OUTGOING', 'COUNTY_INCOMING', 'COUNTY_NAME_INCOMING', 'MSA_OUTGOING', 'MSA_NAME_OUTGOING', 'MSA_INCOMING', 'MSA_NAME_INCOMING', 'NAICS2', 'NAICS2_NAME', 'NAICS3', 'NAICS3_NAME', 'NAICS4', 'NAICS4_NAME', 'NAICS5', 'NAICS5_NAME', 'NAICS6', 'NAICS6_NAME', 'TITLE', 'TITLE_NAME', 'TITLE_CLEAN', 'SKILLS', 'SKILLS_NAME', 'SPECIALIZED_SKILLS', 'SPECIALIZED_SKILLS_NAME', 'CERTIFICATIONS', 'CERTIFICATIONS_NAME', 'COMMON_SKILLS', 'COMMON_SKILLS_NAME', 'SOFTWARE_SKILLS', 'SOFTWARE_SKILLS_NAME', 'ONET', 'ONET_NAME', 'ONET_2019', 'ONET_2019_NAME', 'CIP6', 'CIP6_NAME', 'CIP4', 'CIP4_NAME', 'CIP2', 'CIP2_NAME', 'SOC_2021_2', 'SOC_2021_2_NAME', 'SOC_2021_3', 'SOC_2021_3_NAME', 'SOC_2021_4', 'SOC_2021_4_NAME', 'SOC_2021_5', 'SOC_2021_5_NAME', 'LOT_CAREER_AREA', 'LOT_CAREER_AREA_NAME', 'LOT_OCCUPATION', 'LOT_OCCUPATION_NAME', 'LOT_SPECIALIZED_OCCUPATION', 'LOT_SPECIALIZED_OCCUPATION_NAME', 'LOT_OCCUPATION_GROUP', 'LOT_OCCUPATION_GROUP_NAME', 'LOT_V6_SPECIALIZED_OCCUPATION', 'LOT_V6_SPECIALIZED_OCCUPATION_NAME', 'LOT_V6_OCCUPATION', 'LOT_V6_OCCUPATION_NAME', 'LOT_V6_OCCUPATION_GROUP', 'LOT_V6_OCCUPATION_GROUP_NAME', 'LOT_V6_CAREER_AREA', 'LOT_V6_CAREER_AREA_NAME', 'SOC_2', 'SOC_2_NAME', 'SOC_3', 'SOC_3_NAME', 'SOC_4', 'SOC_4_NAME', 'SOC_5', 
'SOC_5_NAME', 'LIGHTCAST_SECTORS', 'LIGHTCAST_SECTORS_NAME', 'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3', 'NAICS_2022_3_NAME', 'NAICS_2022_4', 'NAICS_2022_4_NAME', 'NAICS_2022_5', 'NAICS_2022_5_NAME', 'NAICS_2022_6', 'NAICS_2022_6_NAME']
# Summarize the frame: one row per column with its dtype and unique-value
# count. Computing every n_unique() inside a single select() makes one pass
# over the data, instead of issuing one full scan per column (131 scans)
# as the original per-column loop did.
unique_counts = jobs.select(pl.all().n_unique()).row(0)

schema = pl.DataFrame({
    'Column': jobs.columns,
    'Data Type': jobs.dtypes,
    'Unique Values': list(unique_counts)
})

# One row per original column: expect (131, 3).
print(schema.shape)
(131, 3)
# Preview the first 20 column summaries.
print(schema.head(20))
shape: (20, 3)
┌────────────────────────┬───────────┬───────────────┐
│ Column                 ┆ Data Type ┆ Unique Values │
│ ---                    ┆ ---       ┆ ---           │
│ str                    ┆ object    ┆ i64           │
╞════════════════════════╪═══════════╪═══════════════╡
│ ID                     ┆ String    ┆ 72476         │
│ LAST_UPDATED_DATE      ┆ String    ┆ 169           │
│ LAST_UPDATED_TIMESTAMP ┆ String    ┆ 174           │
│ DUPLICATES             ┆ Int64     ┆ 72            │
│ POSTED                 ┆ String    ┆ 153           │
│ …                      ┆ …         ┆ …             │
│ MODELED_DURATION       ┆ Int64     ┆ 61            │
│ COMPANY                ┆ Int64     ┆ 12303         │
│ COMPANY_NAME           ┆ String    ┆ 12302         │
│ COMPANY_RAW            ┆ String    ┆ 17214         │
│ COMPANY_IS_STAFFING    ┆ Boolean   ┆ 2             │
└────────────────────────┴───────────┴───────────────┘

some cleaning up first (round 1)…

# Round 1: drop identifiers/URLs and the redundant industry (NAICS) and
# occupation (SOC) code hierarchies; only the 2022 6-digit NAICS level and
# the 5-digit SOC_2021 level are kept for downstream analysis.
columns_to_drop = ['ID', 'URL', 'ACTIVE_URLS', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES']
for level in range(2, 7):   # NAICS2 .. NAICS6 (superseded by NAICS_2022_*)
    columns_to_drop += [f'NAICS{level}', f'NAICS{level}_NAME']
for level in range(2, 5):   # SOC_2021_2 .. SOC_2021_4 (keep SOC_2021_5)
    columns_to_drop += [f'SOC_2021_{level}', f'SOC_2021_{level}_NAME']
for level in range(2, 6):   # SOC_2 .. SOC_5 (legacy SOC hierarchy)
    columns_to_drop += [f'SOC_{level}', f'SOC_{level}_NAME']
for level in range(2, 6):   # NAICS_2022_2 .. NAICS_2022_5 (keep the 6-digit level)
    columns_to_drop += [f'NAICS_2022_{level}', f'NAICS_2022_{level}_NAME']

jobs_filtered = jobs.drop(columns_to_drop)

# Verify the drop: the NAICS/SOC hierarchy columns are gone.
print(jobs_filtered.columns)
['LAST_UPDATED_DATE', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'ACTIVE_SOURCES_INFO', 'TITLE_RAW', 'BODY', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_RAW', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION', 'CITY', 'CITY_NAME', 'COUNTY', 'COUNTY_NAME', 'MSA', 'MSA_NAME', 'STATE', 'STATE_NAME', 'COUNTY_OUTGOING', 'COUNTY_NAME_OUTGOING', 'COUNTY_INCOMING', 'COUNTY_NAME_INCOMING', 'MSA_OUTGOING', 'MSA_NAME_OUTGOING', 'MSA_INCOMING', 'MSA_NAME_INCOMING', 'TITLE', 'TITLE_NAME', 'TITLE_CLEAN', 'SKILLS', 'SKILLS_NAME', 'SPECIALIZED_SKILLS', 'SPECIALIZED_SKILLS_NAME', 'CERTIFICATIONS', 'CERTIFICATIONS_NAME', 'COMMON_SKILLS', 'COMMON_SKILLS_NAME', 'SOFTWARE_SKILLS', 'SOFTWARE_SKILLS_NAME', 'ONET', 'ONET_NAME', 'ONET_2019', 'ONET_2019_NAME', 'CIP6', 'CIP6_NAME', 'CIP4', 'CIP4_NAME', 'CIP2', 'CIP2_NAME', 'SOC_2021_5', 'SOC_2021_5_NAME', 'LOT_CAREER_AREA', 'LOT_CAREER_AREA_NAME', 'LOT_OCCUPATION', 'LOT_OCCUPATION_NAME', 'LOT_SPECIALIZED_OCCUPATION', 'LOT_SPECIALIZED_OCCUPATION_NAME', 'LOT_OCCUPATION_GROUP', 'LOT_OCCUPATION_GROUP_NAME', 'LOT_V6_SPECIALIZED_OCCUPATION', 'LOT_V6_SPECIALIZED_OCCUPATION_NAME', 'LOT_V6_OCCUPATION', 'LOT_V6_OCCUPATION_NAME', 'LOT_V6_OCCUPATION_GROUP', 'LOT_V6_OCCUPATION_GROUP_NAME', 'LOT_V6_CAREER_AREA', 'LOT_V6_CAREER_AREA_NAME', 'LIGHTCAST_SECTORS', 'LIGHTCAST_SECTORS_NAME', 'NAICS_2022_6', 'NAICS_2022_6_NAME']
# A bare expression only displays in a notebook; print it so the script
# reports the post-drop shape like the other checks do.
print(jobs_filtered.shape)
(72476, 94)
# Count nulls in every remaining column, then reshape to long form:
# one (Column, NaN Count) row per column.
nan_counts = jobs_filtered.null_count()

nan_counts_long = (
    nan_counts
    .unpivot()
    .rename({"variable": "Column", "value": "NaN Count"})
)

# One row per remaining column.
print(nan_counts_long.shape)
(94, 2)
# Preview the first 20 null counts.
print(nan_counts_long.head(20))
shape: (20, 2)
┌───────────────────────┬───────────┐
│ Column                ┆ NaN Count │
│ ---                   ┆ ---       │
│ str                   ┆ u32       │
╞═══════════════════════╪═══════════╡
│ LAST_UPDATED_DATE     ┆ 0         │
│ POSTED                ┆ 0         │
│ EXPIRED               ┆ 7822      │
│ DURATION              ┆ 27294     │
│ SOURCE_TYPES          ┆ 0         │
│ …                     ┆ …         │
│ EDUCATION_LEVELS      ┆ 0         │
│ EDUCATION_LEVELS_NAME ┆ 0         │
│ MIN_EDULEVELS         ┆ 0         │
│ MIN_EDULEVELS_NAME    ┆ 0         │
│ MAX_EDULEVELS         ┆ 56155     │
└───────────────────────┴───────────┘
# Nullity-correlation heatmap; missingno operates on pandas frames,
# so convert the polars frame first.

jobs_filtered_pd = jobs_filtered.to_pandas()

msno.heatmap(jobs_filtered_pd)

# Restrict the null report to columns that actually contain nulls, and
# express each count as a percentage of all rows.

total_rows = jobs_filtered.height
nan_counts = jobs_filtered.null_count()

nan_counts_filtered = (
    nan_counts
    .unpivot()
    .rename({"variable": "Column", "value": "NaN Count"})
    .filter(pl.col("NaN Count") > 0)
    .with_columns((pl.col("NaN Count") / total_rows * 100).alias("NaN Percentage"))
)

print(nan_counts_filtered.to_pandas())
                    Column  NaN Count  NaN Percentage
0                  EXPIRED       7822       10.792538
1                 DURATION      27294       37.659363
2      ACTIVE_SOURCES_INFO      64654       89.207462
3                TITLE_RAW         60        0.082786
4          MODELED_EXPIRED      15383       21.224957
5         MODELED_DURATION      19261       26.575694
6              COMPANY_RAW        497        0.685744
7            MAX_EDULEVELS      56155       77.480821
8       MAX_EDULEVELS_NAME      56155       77.480821
9     MIN_YEARS_EXPERIENCE      23113       31.890557
10    MAX_YEARS_EXPERIENCE      64046       88.368563
11                  SALARY      41658       57.478338
12     ORIGINAL_PAY_PERIOD      40068       55.284508
13               SALARY_TO      40068       55.284508
14             SALARY_FROM      40068       55.284508
15                     MSA       3908        5.392130
16                MSA_NAME       3908        5.392130
17            MSA_OUTGOING       3908        5.392130
18       MSA_NAME_OUTGOING       3908        5.392130
19            MSA_INCOMING       3921        5.410067
20       MSA_NAME_INCOMING       3921        5.410067
21             TITLE_CLEAN         96        0.132458
22       LIGHTCAST_SECTORS      54682       75.448424
23  LIGHTCAST_SECTORS_NAME      54682       75.448424
# Bar chart of the per-column null percentages computed above.

nan_counts_filtered_pd = nan_counts_filtered.to_pandas()

bar_kwargs = dict(
    x="Column",
    y="NaN Percentage",
    color="NaN Percentage",
    title="NaN Percentages per Column",
    color_continuous_scale="Viridis",
)
fig = px.bar(nan_counts_filtered_pd, **bar_kwargs)

fig.update_layout(width=1200, height=500, xaxis_tickangle=-45)
fig.show()

a bit more cleaning up (round 2)…

# Round 2: drop the remaining >=50%-null columns, but deliberately keep
# the salary fields (SALARY, SALARY_FROM, SALARY_TO) despite their nulls —
# they are the target of the analysis below.

columns_to_drop = [
    "ACTIVE_SOURCES_INFO",
    "MAX_EDULEVELS",
    "MAX_EDULEVELS_NAME",
    "MAX_YEARS_EXPERIENCE",
    "ORIGINAL_PAY_PERIOD",
    "LIGHTCAST_SECTORS",
    "LIGHTCAST_SECTORS_NAME",
]

jobs_filtered_2 = jobs_filtered.drop(columns_to_drop)
print(jobs_filtered_2.shape)
(72476, 87)
# A posting counts as a duplicate when the same title/company/location/post
# date combination appears more than once; keep the first occurrence.

dedup_keys = ["TITLE", "COMPANY", "LOCATION", "POSTED"]
jobs_filtered_2 = jobs_filtered_2.unique(subset=dedup_keys, keep="first")
print(jobs_filtered_2.shape)
(69200, 87)
# Re-audit the nulls after the round-2 drops and deduplication.
nan_counts = jobs_filtered_2.null_count()
total_rows = jobs_filtered_2.height

nan_counts_filtered = (
    nan_counts.unpivot()
    .rename({"variable": "Column", "value": "NaN Count"})
    .with_columns(
        (pl.col("NaN Count") / total_rows * 100).alias("NaN Percentage")
    )
    .filter(pl.col("NaN Count") > 0)
)

print(nan_counts_filtered.to_pandas())
                  Column  NaN Count  NaN Percentage
0                EXPIRED       7462       10.783237
1               DURATION      26092       37.705202
2              TITLE_RAW         54        0.078035
3        MODELED_EXPIRED      14739       21.299133
4       MODELED_DURATION      18401       26.591040
5            COMPANY_RAW        489        0.706647
6   MIN_YEARS_EXPERIENCE      22339       32.281792
7                 SALARY      39954       57.736994
8              SALARY_TO      38490       55.621387
9            SALARY_FROM      38490       55.621387
10                   MSA       2806        4.054913
11              MSA_NAME       2806        4.054913
12          MSA_OUTGOING       2806        4.054913
13     MSA_NAME_OUTGOING       2806        4.054913
14          MSA_INCOMING       2816        4.069364
15     MSA_NAME_INCOMING       2816        4.069364
16           TITLE_CLEAN         88        0.127168
# How many postings fall into each remote-work category?

remote_type_series = jobs_filtered_2["REMOTE_TYPE_NAME"]

value_counts = remote_type_series.value_counts(sort=True)

print(value_counts)
shape: (4, 2)
┌──────────────────┬───────┐
│ REMOTE_TYPE_NAME ┆ count │
│ ---              ┆ ---   │
│ str              ┆ u32   │
╞══════════════════╪═══════╡
│ [None]           ┆ 54211 │
│ Remote           ┆ 11745 │
│ Hybrid Remote    ┆ 2151  │
│ Not Remote       ┆ 1093  │
└──────────────────┴───────┘

go ahead with a SQL connection…

# Stage the cleaned frame in an in-memory SQLite database so the rest of
# the exploration can be done in SQL.
conn = sqlite3.connect(':memory:')

conn.execute("DROP TABLE IF EXISTS jobs;")

# Every column is declared TEXT; SQLite's type affinity still lets
# numeric-looking values participate in the AVG()/COUNT() queries below.
columns = ", ".join([f"{col} TEXT" for col in jobs_filtered_2.columns])
create_table_query = f"CREATE TABLE jobs ({columns});"
conn.execute(create_table_query)

insert_query = f"INSERT INTO jobs VALUES ({', '.join(['?'] * len(jobs_filtered_2.columns))})"

# rows() yields plain Python tuples and preserves None as SQL NULL.
# The original to_numpy().tolist() round-trip coerced dtypes first
# (e.g. nullable integer columns become floats, nulls become NaN),
# which distorts the values that end up in the database.
conn.executemany(insert_query, jobs_filtered_2.rows())

conn.commit()

print("Data from jobs_filtered_2 has been successfully inserted into the SQLite database.")
Data from jobs_filtered_2 has been successfully inserted into the SQLite database.
# Sanity check: the SQLite row count must match the cleaned frame's height.
query_count = """
SELECT COUNT(*) AS TotalCount
FROM jobs;
"""

cursor = conn.cursor()
cursor.execute(query_count)
result = cursor.fetchall()

# orient="row" is explicit here for consistency with the other fetchall()
# conversions below and to avoid polars' orientation inference on the
# list-of-tuples result.
total_count = pl.DataFrame(result, schema=["TotalCount"], orient="row")

print(total_count)
shape: (1, 1)
┌────────────┐
│ TotalCount │
│ ---        │
│ i64        │
╞════════════╡
│ 69200      │
└────────────┘

by NAICS_2022_6_NAME

# Number of distinct 6-digit NAICS industry names in the cleaned data.
# .item() extracts the single scalar directly instead of round-tripping
# through a NumPy array and double-indexing with [0][0].
unique_naics_count = jobs_filtered_2.select(pl.col("NAICS_2022_6_NAME").n_unique()).item()

print(f"Unique NAICS_2022_6_NAME count: {unique_naics_count}")
Unique NAICS_2022_6_NAME count: 814
# We're interested in the finance, marketing, and transportation industries.
# (SQLite LIKE is case-insensitive for ASCII, so '%finance%' also matches
# 'Finance'.)

query_filtered_naics_count = """
SELECT COUNT(DISTINCT NAICS_2022_6_NAME) AS Unique_NAICS_Count
FROM jobs
WHERE NAICS_2022_6_NAME LIKE '%finance%'
   OR NAICS_2022_6_NAME LIKE '%banking%'
   OR NAICS_2022_6_NAME LIKE '%investing%'
   OR NAICS_2022_6_NAME LIKE '%investments%'
   OR NAICS_2022_6_NAME LIKE '%marketing%'
   OR NAICS_2022_6_NAME LIKE '%transportation%';
"""

cursor = conn.cursor()
cursor.execute(query_filtered_naics_count)
result = cursor.fetchall()

# orient="row" for consistency with the other fetchall() conversions.
unique_filtered_naics_count_df = pl.DataFrame(result, schema=["Unique_NAICS_Count"], orient="row")

print(unique_filtered_naics_count_df)
shape: (1, 1)
┌────────────────────┐
│ Unique_NAICS_Count │
│ ---                │
│ i64                │
╞════════════════════╡
│ 28                 │
└────────────────────┘
# Show all 28 matched industry names without polars truncating the table.
pl.Config.set_tbl_rows(28)

query_filtered_naics = """
SELECT DISTINCT NAICS_2022_6_NAME
FROM jobs
WHERE NAICS_2022_6_NAME LIKE '%finance%'
   OR NAICS_2022_6_NAME LIKE '%banking%'
   OR NAICS_2022_6_NAME LIKE '%investing%'
   OR NAICS_2022_6_NAME LIKE '%investments%'
   OR NAICS_2022_6_NAME LIKE '%marketing%'
   OR NAICS_2022_6_NAME LIKE '%transportation%';
"""

# Connection.execute() opens a cursor implicitly.
result = conn.execute(query_filtered_naics).fetchall()

filtered_naics_df = pl.DataFrame(result, schema=["NAICS_2022_6_NAME"], orient="row")

print(filtered_naics_df.shape)
(28, 1)
# Full list of the 28 matched industry names.
print(filtered_naics_df)
shape: (28, 1)
┌─────────────────────────────────┐
│ NAICS_2022_6_NAME               │
│ ---                             │
│ str                             │
╞═════════════════════════════════╡
│ Commercial Banking              │
│ Scheduled Passenger Air Transp… │
│ Telemarketing Bureaus and Othe… │
│ Investment Banking and Securit… │
│ Marketing Consulting Services   │
│ Freight Transportation Arrange… │
│ Regulation of Agricultural Mar… │
│ Marketing Research and Public … │
│ Special Needs Transportation    │
│ Regulation and Administration … │
│ Nonscheduled Chartered Freight… │
│ Scenic and Sightseeing Transpo… │
│ Other Support Activities for A… │
│ Public Finance Activities       │
│ Nonscheduled Chartered Passeng… │
│ Pipeline Transportation of Nat… │
│ School and Employee Bus Transp… │
│ Scheduled Freight Air Transpor… │
│ Pipeline Transportation of Cru… │
│ All Other Pipeline Transportat… │
│ Inland Water Freight Transport… │
│ Deep Sea Freight Transportatio… │
│ All Other Transit and Ground P… │
│ Pipeline Transportation of Ref… │
│ Transportation Equipment and S… │
│ Support Activities for Rail Tr… │
│ All Other Transportation Equip… │
│ Other Support Activities for R… │
└─────────────────────────────────┘
# Average salary per matched industry; rows lacking any of the three
# salary fields are excluded, which drops industries with no salary data.

query_avg_salary_by_filtered_naics = """
SELECT 
    NAICS_2022_6_NAME,
    COUNT(NAICS_2022_6_NAME) AS Occurrences,
    AVG(SALARY) AS Avg_Salary,
    AVG(SALARY_FROM) AS Avg_Salary_From,
    AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
  AND (NAICS_2022_6_NAME LIKE '%finance%'
   OR NAICS_2022_6_NAME LIKE '%banking%'
   OR NAICS_2022_6_NAME LIKE '%investing%'
   OR NAICS_2022_6_NAME LIKE '%investments%'
   OR NAICS_2022_6_NAME LIKE '%marketing%'
   OR NAICS_2022_6_NAME LIKE '%transportation%')
GROUP BY NAICS_2022_6_NAME
ORDER BY Occurrences DESC;
"""

# Connection.execute() opens a cursor implicitly.
result = conn.execute(query_avg_salary_by_filtered_naics).fetchall()

pl.Config.set_tbl_rows(28)
pl.Config.set_tbl_cols(None)

naics_salary_schema = ["NAICS_2022_6_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"]
avg_salary_by_filtered_naics_df = pl.DataFrame(result, schema=naics_salary_schema, orient="row")

print(avg_salary_by_filtered_naics_df.shape)
(23, 5)
# NOTE: drops 5 of the 28 matched industries (no salary data recorded)

print(avg_salary_by_filtered_naics_df)
shape: (23, 5)
┌─────────────────────────────────┬─────────────┬───────────────┬─────────────────┬───────────────┐
│ NAICS_2022_6_NAME               ┆ Occurrences ┆ Avg_Salary    ┆ Avg_Salary_From ┆ Avg_Salary_To │
│ ---                             ┆ ---         ┆ ---           ┆ ---             ┆ ---           │
│ str                             ┆ i64         ┆ f64           ┆ f64             ┆ f64           │
╞═════════════════════════════════╪═════════════╪═══════════════╪═════════════════╪═══════════════╡
│ Commercial Banking              ┆ 922         ┆ 127314.572668 ┆ 103130.91757    ┆ 151398.733189 │
│ Telemarketing Bureaus and Othe… ┆ 201         ┆ 90259.129353  ┆ 72430.199005    ┆ 107765.024876 │
│ Investment Banking and Securit… ┆ 94          ┆ 120806.776596 ┆ 95981.223404    ┆ 144876.382979 │
│ Marketing Consulting Services   ┆ 48          ┆ 79475.604167  ┆ 69490.916667    ┆ 87989.833333  │
│ Regulation and Administration … ┆ 28          ┆ 93925.928571  ┆ 76147.75        ┆ 111066.857143 │
│ Scheduled Passenger Air Transp… ┆ 18          ┆ 121229.555556 ┆ 105449.777778   ┆ 137009.444444 │
│ Marketing Research and Public … ┆ 14          ┆ 90237.857143  ┆ 73339.285714    ┆ 107136.428571 │
│ Public Finance Activities       ┆ 13          ┆ 108434.923077 ┆ 85725.0         ┆ 131145.153846 │
│ Regulation of Agricultural Mar… ┆ 11          ┆ 85541.909091  ┆ 73135.454545    ┆ 97948.636364  │
│ Freight Transportation Arrange… ┆ 10          ┆ 79420.3       ┆ 70311.1         ┆ 88529.8       │
│ Other Support Activities for A… ┆ 6           ┆ 82500.0       ┆ 71333.333333    ┆ 93666.666667  │
│ Nonscheduled Chartered Freight… ┆ 6           ┆ 108785.333333 ┆ 91843.166667    ┆ 125727.666667 │
│ Pipeline Transportation of Nat… ┆ 3           ┆ 98722.666667  ┆ 76333.333333    ┆ 120000.0      │
│ Special Needs Transportation    ┆ 2           ┆ 62565.5       ┆ 57566.0         ┆ 65286.5       │
│ School and Employee Bus Transp… ┆ 2           ┆ 62650.0       ┆ 61090.0         ┆ 64210.0       │
│ Scenic and Sightseeing Transpo… ┆ 2           ┆ 80069.5       ┆ 67591.0         ┆ 92548.0       │
│ All Other Transit and Ground P… ┆ 2           ┆ 90675.0       ┆ 77500.0         ┆ 103850.0      │
│ Support Activities for Rail Tr… ┆ 1           ┆ 150000.0      ┆ 140000.0        ┆ 160000.0      │
│ Scheduled Freight Air Transpor… ┆ 1           ┆ 62400.0       ┆ 62400.0         ┆ 62400.0       │
│ Other Support Activities for R… ┆ 1           ┆ 55000.0       ┆ 50000.0         ┆ 60000.0       │
│ Nonscheduled Chartered Passeng… ┆ 1           ┆ 47008.0       ┆ 47008.0         ┆ 47008.0       │
│ Deep Sea Freight Transportatio… ┆ 1           ┆ 131400.0      ┆ 111300.0        ┆ 151500.0      │
│ All Other Transportation Equip… ┆ 1           ┆ 89523.0       ┆ 73299.0         ┆ 105747.0      │
└─────────────────────────────────┴─────────────┴───────────────┴─────────────────┴───────────────┘

by TITLE_NAME

# Number of distinct standardized job titles in the cleaned data.
# .item() extracts the single scalar directly instead of round-tripping
# through a NumPy array and double-indexing with [0][0].
unique_title_count = jobs_filtered_2.select(pl.col("TITLE_NAME").n_unique()).item()

print(f"Unique TITLE_NAME count: {unique_title_count}")
Unique TITLE_NAME count: 5720
# We're interested in analyst positions (analyst/analytics/data titles).

pl.Config.set_tbl_rows(5000)

query_filtered_title = """
SELECT DISTINCT TITLE_NAME
FROM jobs
WHERE TITLE_NAME LIKE '%analyst%'
   OR TITLE_NAME LIKE '%analytics%'
   OR TITLE_NAME LIKE '%data%';
"""

# Connection.execute() opens a cursor implicitly.
result = conn.execute(query_filtered_title).fetchall()

filtered_title_df = pl.DataFrame(result, schema=["TITLE_NAME"], orient="row")

print(filtered_title_df.shape)
(1476, 1)
# Preview the first 20 matched titles.
print(filtered_title_df.head(20))
shape: (20, 1)
┌────────────────────────────────┐
│ TITLE_NAME                     │
│ ---                            │
│ str                            │
╞════════════════════════════════╡
│ Data Analysts                  │
│ Oracle Analysts                │
│ Data Reporting Analysts        │
│ Data Management Analysts       │
│ Health Data Analysts           │
│ Lead Data Analysts             │
│ Customer Experience Analysts   │
│ Data Analytics Leads           │
│ WFM Analysts                   │
│ ERP Business Analysts          │
│ Business Intelligence Analysts │
│ Oracle Database Administrators │
│ Oracle Functional Analysts     │
│ Principal Data Scientists      │
│ Oracle Programmer Analysts     │
│ Enterprise Data Architects     │
│ Data and Reporting Analysts    │
│ ERP Systems Analysts           │
│ SAP Functional Analysts        │
│ Fraud Analysts                 │
└────────────────────────────────┘
# Average salary per matched title; rows lacking any of the three salary
# fields are excluded, which drops titles with no salary data.

query_avg_salary_by_filtered_title = """
SELECT 
    TITLE_NAME,
    COUNT(TITLE_NAME) AS Occurrences,
    AVG(SALARY) AS Avg_Salary,
    AVG(SALARY_FROM) AS Avg_Salary_From,
    AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
  AND (TITLE_NAME LIKE '%analyst%'
   OR TITLE_NAME LIKE '%analytics%'
   OR TITLE_NAME LIKE '%data%')
GROUP BY TITLE_NAME
ORDER BY Occurrences DESC;
"""

# Connection.execute() opens a cursor implicitly.
result = conn.execute(query_avg_salary_by_filtered_title).fetchall()

pl.Config.set_tbl_rows(1500)
pl.Config.set_tbl_cols(None)

title_salary_schema = ["TITLE_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"]
avg_salary_by_filtered_title_df = pl.DataFrame(result, schema=title_salary_schema, orient="row")

print(avg_salary_by_filtered_title_df.shape)
(1005, 5)
# NOTE: drops 471 of the 1476 matched titles (no salary data recorded)

print(avg_salary_by_filtered_title_df.head(20))
shape: (20, 5)
┌─────────────────────────────────┬─────────────┬───────────────┬─────────────────┬───────────────┐
│ TITLE_NAME                      ┆ Occurrences ┆ Avg_Salary    ┆ Avg_Salary_From ┆ Avg_Salary_To │
│ ---                             ┆ ---         ┆ ---           ┆ ---             ┆ ---           │
│ str                             ┆ i64         ┆ f64           ┆ f64             ┆ f64           │
╞═════════════════════════════════╪═════════════╪═══════════════╪═════════════════╪═══════════════╡
│ Data Analysts                   ┆ 3467        ┆ 93825.56533   ┆ 81035.9152      ┆ 106047.649841 │
│ Business Intelligence Analysts  ┆ 938         ┆ 103024.716418 ┆ 87234.638593    ┆ 118629.098081 │
│ Data Analytics Engineers        ┆ 325         ┆ 185512.461538 ┆ 161054.243077   ┆ 209111.68     │
│ Data and Reporting Analysts     ┆ 290         ┆ 79633.92069   ┆ 66126.458621    ┆ 91923.089655  │
│ Data Governance Analysts        ┆ 228         ┆ 111827.557018 ┆ 94360.548246    ┆ 128478.942982 │
│ Data Quality Analysts           ┆ 193         ┆ 104164.165803 ┆ 91157.621762    ┆ 116710.108808 │
│ Data Analytics Analysts         ┆ 192         ┆ 109418.296875 ┆ 87121.588542    ┆ 131655.473958 │
│ Data Management Analysts        ┆ 178         ┆ 107976.398876 ┆ 92009.02809     ┆ 123313.651685 │
│ Data Modelers                   ┆ 172         ┆ 136975.895349 ┆ 118876.593023   ┆ 155075.238372 │
│ Lead Data Analysts              ┆ 170         ┆ 111721.929412 ┆ 95335.964706    ┆ 127742.011765 │
│ Research Data Analysts          ┆ 166         ┆ 81899.710843  ┆ 69225.656627    ┆ 94308.771084  │
│ IT Data Analytics Analysts      ┆ 164         ┆ 107843.939024 ┆ 84157.25        ┆ 131339.04878  │
│ Lead Business Intelligence Ana… ┆ 151         ┆ 120744.960265 ┆ 100244.437086   ┆ 140527.245033 │
│ Data Science Analysts           ┆ 133         ┆ 114863.992481 ┆ 96807.609023    ┆ 132389.849624 │
│ Data Analytics Leads            ┆ 120         ┆ 162420.35     ┆ 120716.908333   ┆ 203772.808333 │
│ Data Operations Analysts        ┆ 114         ┆ 88823.359649  ┆ 73918.745614    ┆ 102379.657895 │
│ Business Intelligence Data Ana… ┆ 113         ┆ 103099.938053 ┆ 87012.858407    ┆ 119073.752212 │
│ Health Data Analysts            ┆ 107         ┆ 91458.654206  ┆ 74872.214953    ┆ 107990.401869 │
│ Data and Analytics Consultants  ┆ 107         ┆ 127020.224299 ┆ 94460.691589    ┆ 158536.672897 │
│ Enterprise Data Architects      ┆ 106         ┆ 166127.603774 ┆ 146011.613208   ┆ 186243.622642 │
└─────────────────────────────────┴─────────────┴───────────────┴─────────────────┴───────────────┘

now use both of the queries…

# Analyst positions within the industries we're interested in: combine
# the title and industry filters and aggregate salaries per (title,
# industry) pair.

query_avg_salary_by_filtered_title_naics = """
SELECT 
    TITLE_NAME,
    NAICS_2022_6_NAME,
    COUNT(TITLE_NAME) AS Occurrences,
    AVG(SALARY) AS Avg_Salary,
    AVG(SALARY_FROM) AS Avg_Salary_From,
    AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
  AND (TITLE_NAME LIKE '%analyst%'
   OR TITLE_NAME LIKE '%analytics%'
   OR TITLE_NAME LIKE '%data%')
  AND (NAICS_2022_6_NAME LIKE '%finance%'
   OR NAICS_2022_6_NAME LIKE '%banking%'
   OR NAICS_2022_6_NAME LIKE '%investing%'
   OR NAICS_2022_6_NAME LIKE '%investments%'
   OR NAICS_2022_6_NAME LIKE '%marketing%'
   OR NAICS_2022_6_NAME LIKE '%transportation%')
GROUP BY TITLE_NAME, NAICS_2022_6_NAME
ORDER BY Occurrences DESC;
"""

# Connection.execute() opens a cursor implicitly.
result = conn.execute(query_avg_salary_by_filtered_title_naics).fetchall()

pl.Config.set_tbl_rows(250)
pl.Config.set_tbl_cols(None)

combo_schema = ["TITLE_NAME", "NAICS_2022_6_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"]
avg_salary_by_filtered_title_naics_df = pl.DataFrame(result, schema=combo_schema, orient="row")

print(avg_salary_by_filtered_title_naics_df.shape)
(244, 6)
# 244 (title, industry) groups remaining

print(avg_salary_by_filtered_title_naics_df.head(20))
shape: (20, 6)
┌─────────────────┬─────────────────┬─────────────┬───────────────┬────────────────┬───────────────┐
│ TITLE_NAME      ┆ NAICS_2022_6_NA ┆ Occurrences ┆ Avg_Salary    ┆ Avg_Salary_Fro ┆ Avg_Salary_To │
│ ---             ┆ ME              ┆ ---         ┆ ---           ┆ m              ┆ ---           │
│ str             ┆ ---             ┆ i64         ┆ f64           ┆ ---            ┆ f64           │
│                 ┆ str             ┆             ┆               ┆ f64            ┆               │
╞═════════════════╪═════════════════╪═════════════╪═══════════════╪════════════════╪═══════════════╡
│ Data and        ┆ Telemarketing   ┆ 65          ┆ 81366.523077  ┆ 64625.384615   ┆ 97198.0       │
│ Reporting       ┆ Bureaus and     ┆             ┆               ┆                ┆               │
│ Analysts        ┆ Othe…           ┆             ┆               ┆                ┆               │
│ Health Data     ┆ Telemarketing   ┆ 51          ┆ 94717.647059  ┆ 75764.705882   ┆ 113670.588235 │
│ Analysts        ┆ Bureaus and     ┆             ┆               ┆                ┆               │
│                 ┆ Othe…           ┆             ┆               ┆                ┆               │
│ Data            ┆ Commercial      ┆ 41          ┆ 172009.756098 ┆ 137607.804878  ┆ 206411.707317 │
│ Integration     ┆ Banking         ┆             ┆               ┆                ┆               │
│ Leads           ┆                 ┆             ┆               ┆                ┆               │
│ Data Governance ┆ Commercial      ┆ 35          ┆ 129052.342857 ┆ 103377.371429  ┆ 154727.314286 │
│ Analysts        ┆ Banking         ┆             ┆               ┆                ┆               │
│ Data Analysts   ┆ Commercial      ┆ 31          ┆ 105933.612903 ┆ 87030.903226   ┆ 124547.258065 │
│                 ┆ Banking         ┆             ┆               ┆                ┆               │
│ Crime Analysts  ┆ Telemarketing   ┆ 25          ┆ 46635.28      ┆ 44214.4        ┆ 48977.6       │
│                 ┆ Bureaus and     ┆             ┆               ┆                ┆               │
│                 ┆ Othe…           ┆             ┆               ┆                ┆               │
│ Lead Business   ┆ Commercial      ┆ 23          ┆ 113100.0      ┆ 86840.0        ┆ 139360.0      │
│ Intelligence    ┆ Banking         ┆             ┆               ┆                ┆               │
│ Ana…            ┆                 ┆             ┆               ┆                ┆               │
│ Data Management ┆ Commercial      ┆ 20          ┆ 159859.65     ┆ 124353.0       ┆ 195366.3      │
│ Analysts        ┆ Banking         ┆             ┆               ┆                ┆               │
│ Business        ┆ Commercial      ┆ 19          ┆ 109138.526316 ┆ 88355.842105   ┆ 129921.210526 │
│ Intelligence    ┆ Banking         ┆             ┆               ┆                ┆               │
│ Analysts        ┆                 ┆             ┆               ┆                ┆               │
│ Data Quality    ┆ Commercial      ┆ 17          ┆ 107957.117647 ┆ 85142.588235   ┆ 130771.705882 │
│ Analysts        ┆ Banking         ┆             ┆               ┆                ┆               │
│ Data Analytics  ┆ Commercial      ┆ 16          ┆ 167973.3125   ┆ 134563.0       ┆ 201383.625    │
│ Leads           ┆ Banking         ┆             ┆               ┆                ┆               │
│ Enterprise Risk ┆ Commercial      ┆ 16          ┆ 108521.25     ┆ 81160.4375     ┆ 135528.6875   │
│ Analysts        ┆ Banking         ┆             ┆               ┆                ┆               │
│ Data Analysts   ┆ Regulation and  ┆ 14          ┆ 93407.285714  ┆ 73797.285714   ┆ 113018.285714 │
│                 ┆ Administration  ┆             ┆               ┆                ┆               │
│                 ┆ …               ┆             ┆               ┆                ┆               │
│ Data Warehouse  ┆ Commercial      ┆ 14          ┆ 86984.714286  ┆ 66977.0        ┆ 106312.0      │
│ Business        ┆ Banking         ┆             ┆               ┆                ┆               │
│ Analys…         ┆                 ┆             ┆               ┆                ┆               │
│ Data Quality    ┆ Commercial      ┆ 13          ┆ 156542.230769 ┆ 129509.615385  ┆ 183574.846154 │
│ Leads           ┆ Banking         ┆             ┆               ┆                ┆               │
│ Business        ┆ Investment      ┆ 12          ┆ 125392.5      ┆ 92790.75       ┆ 157995.0      │
│ Intelligence    ┆ Banking and     ┆             ┆               ┆                ┆               │
│ Analysts        ┆ Securit…        ┆             ┆               ┆                ┆               │
│ Business        ┆ Telemarketing   ┆ 11          ┆ 107000.0      ┆ 84927.272727   ┆ 129072.727273 │
│ Intelligence    ┆ Bureaus and     ┆             ┆               ┆                ┆               │
│ Analysts        ┆ Othe…           ┆             ┆               ┆                ┆               │
│ Data Analysts   ┆ Investment      ┆ 10          ┆ 82393.0       ┆ 70166.0        ┆ 94620.0       │
│                 ┆ Banking and     ┆             ┆               ┆                ┆               │
│                 ┆ Securit…        ┆             ┆               ┆                ┆               │
│ Data Analytics  ┆ Commercial      ┆ 10          ┆ 101205.0      ┆ 88850.8        ┆ 113559.2      │
│ Analysts        ┆ Banking         ┆             ┆               ┆                ┆               │
│ Data            ┆ Commercial      ┆ 10          ┆ 115110.0      ┆ 92377.0        ┆ 137843.0      │
│ Integration     ┆ Banking         ┆             ┆               ┆                ┆               │
│ Analysts        ┆                 ┆             ┆               ┆                ┆               │
└─────────────────┴─────────────────┴─────────────┴───────────────┴────────────────┴───────────────┘

plot 1

# Plot 1: top 20 (title, industry) pairs by number of postings.
# avg_salary_by_filtered_title_naics_df was already built from this exact
# result set above; rebuilding it from the module-level `result` variable
# was redundant (and fragile, since later queries overwrite `result`).
avg_salary_by_filtered_title_naics_df_pd = avg_salary_by_filtered_title_naics_df.to_pandas()

# Rows are already sorted by Occurrences DESC, so head(20) is the top 20.
top_20_df = avg_salary_by_filtered_title_naics_df_pd.head(20)

fig = px.bar(top_20_df, 
             x='TITLE_NAME', 
             y='Occurrences', 
             color='NAICS_2022_6_NAME', 
             title="Top 20 Job Titles by Occurrences and their Industry with Applied Filters",
             labels={'TITLE_NAME': 'Job Title', 'Occurrences': 'Number of Occurrences'},
             hover_data=['Avg_Salary', 'Avg_Salary_From', 'Avg_Salary_To'])

fig.update_layout(xaxis_tickangle=45)

fig.show()

plot 2

# Plot 2: which states host the most filtered postings?
# The original SELECT also listed TITLE_NAME and NAICS_2022_6_NAME, but with
# GROUP BY STATE_NAME those bare columns are filled from an arbitrary row of
# each state's group (a SQLite quirk), which is misleading — so only
# STATE_NAME and true aggregates are selected here.
query_avg_salary_by_filtered_title_naics_state = """
SELECT 
    STATE_NAME,
    COUNT(TITLE_NAME) AS Occurrences,
    AVG(SALARY) AS Avg_Salary,
    AVG(SALARY_FROM) AS Avg_Salary_From,
    AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
  AND (TITLE_NAME LIKE '%analyst%'
   OR TITLE_NAME LIKE '%analytics%'
   OR TITLE_NAME LIKE '%data%')
  AND (NAICS_2022_6_NAME LIKE '%finance%'
   OR NAICS_2022_6_NAME LIKE '%banking%'
   OR NAICS_2022_6_NAME LIKE '%investing%'
   OR NAICS_2022_6_NAME LIKE '%investments%'
   OR NAICS_2022_6_NAME LIKE '%marketing%'
   OR NAICS_2022_6_NAME LIKE '%transportation%')
GROUP BY STATE_NAME
ORDER BY Occurrences DESC
LIMIT 7;
"""

cursor = conn.cursor()
cursor.execute(query_avg_salary_by_filtered_title_naics_state)
result = cursor.fetchall()

pl.Config.set_tbl_rows(250) 
pl.Config.set_tbl_cols(None)

avg_salary_by_filtered_title_naics_state_df = pl.DataFrame(
    result,
    schema=["STATE_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"],
    orient="row",
)

avg_salary_by_filtered_title_naics_state_df_pd = avg_salary_by_filtered_title_naics_state_df.to_pandas()


fig = px.pie(avg_salary_by_filtered_title_naics_state_df_pd,
             names='STATE_NAME',
             values='Occurrences',
             title="Top 7 States by Occurrences with Applied Filters",
             color='STATE_NAME')

fig.show()

plot 3

# Plot 3: treemap of industry -> title, sized by posting count and
# colored by average salary.
avg_salary_by_filtered_title_naics_df_pd = avg_salary_by_filtered_title_naics_df.to_pandas()

fig = px.treemap(
    avg_salary_by_filtered_title_naics_df_pd,
    path=['NAICS_2022_6_NAME', 'TITLE_NAME'],
    values='Occurrences',
    color='Avg_Salary',
    color_continuous_scale='Viridis',
    title='Analyst Positions within Selected Industries with Recorded Salary Information',
)

fig.update_layout(
    margin=dict(t=50, l=25, r=25, b=25),
    coloraxis_colorbar=dict(title="Avg Salary", tickprefix="$"),
)

fig.show()

# Done with the in-memory database.
conn.close()